//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information. 
//

/*++



Module Name:

    optimizedtls.cpp

Abstract:

    Implementation of platform-specific Thread local storage functions.



--*/

#include "pal/thread.hpp"
#include "pal/malloc.hpp"

#include <pthread.h>

#include "pal/dbgmsg.h"
#include "pal/misc.h"
#include "pal/debug.h"

#include <stddef.h>

using namespace CorUnix;

SET_DEFAULT_DEBUG_CHANNEL(THREAD);

#if defined(USE_OPTIMIZEDTLSGETTER)

// Computes the byte offset of member m inside type s without needing a real
// object: it takes the address of m in a pretend instance of s located at
// address 64, then subtracts that base (64) back out of the result.
// m may include an array subscript (it is used below as tlsSlots[dwTlsIndex]).
// NOTE(review): the non-zero base and the (char&) cast presumably exist to
// sidestep compiler diagnostics about null-pointer member access and any
// overloaded operator& on the member type -- confirm before changing.
#define PAL_safe_offsetof(s,m) ((size_t)((ptrdiff_t)&(char&)(((s *)64)->m))-64)

/*++
Function:
    CorUnix::TLSMakeOptimizedGetter

    Creates a platform-optimized version of TlsGetValue compiled
    for a particular index.

    Generates the hot part of CorUnix::InternalGetCurrentThread
    as a chunk of highly optimized machine-specific code at runtime.

    Check the difference between CorUnix::InternalGetCurrentThread and
    CorUnix::InternalGetCurrentThreadSlow to see the C/C++ code that matches
    the code generated by this function.

Parameters:
    pThread    - thread object handed to InternalMalloc when allocating the
                 buffer that receives the generated code.
    dwTlsIndex - TLS slot the generated getter reads. When it equals
                 THREAD_OBJECT_TLS_INDEX the generated code returns the
                 thread object pointer itself instead of a tlsSlots entry.

Return value:
    Pointer to the generated code, or NULL if the buffer could not be
    allocated. On BIT64 builds nothing is generated and NULL is always
    returned (after reporting an ERROR).

Notes:
    The emitted instructions are 32-bit x86. The conditional jump
    displacements and TLS_OPTIMIZED_GETTER_SIZE are hard-coded, so any change
    to the number of emitted bytes must update them as well.
--*/
PAL_POPTIMIZEDTLSGETTER
CorUnix::TLSMakeOptimizedGetter(
        IN CPalThread* pThread,
        IN DWORD dwTlsIndex)
{
#ifdef BIT64
#pragma unused(pThread, dwTlsIndex)
    ERROR("TLSMakeOptimizedGetter not rewritten for amd64 yet.");
    return NULL;
#else
    PAL_POPTIMIZEDTLSGETTER Ret = NULL;
    BYTE* p;
    int i = 0; // write cursor: index of the next byte to emit into p

    // Exact number of bytes emitted below when dwTlsIndex is not
    // THREAD_OBJECT_TLS_INDEX (the longest variant). The Apple variant is
    // 3 bytes longer because of the extra stack-alignment instruction.
#ifdef __APPLE__
#define TLS_OPTIMIZED_GETTER_SIZE 118
#else
#define TLS_OPTIMIZED_GETTER_SIZE 115
#endif

    // NOTE(review): the generated code is executed from this buffer; whether
    // InternalMalloc returns executable memory is not visible here -- confirm.
    p = (BYTE*)InternalMalloc(pThread, TLS_OPTIMIZED_GETTER_SIZE * sizeof(BYTE));

    if (p == NULL)
    {
        return Ret;
    }

    // ---- Fast path ----
    // Hash the stack pointer into an 8-bit key, look up the cached thread in
    // thread_hints[key], and accept it only if the stack pointer lies inside
    // that thread's recorded stack range and flush_counter did not change.

    // Need to preserve %ecx, %edx, and %esi registers as specified in
    // GetThreadGeneric(void) in vm/i386/asmhelpers.s
    p[i++] = 0x51; // push %ecx
    p[i++] = 0x52; // push %edx
    p[i++] = 0x89; // mov %esp,%eax // %eax = sp;
    p[i++] = 0xe0;
    p[i++] = 0xc1; // shr $0x11,%eax // sp >> 17;
    p[i++] = 0xe8;
    p[i++] = 0x11;
    p[i++] = 0x89; // mov %eax,%edx // key = sp >> 17;
    p[i++] = 0xc2;
    p[i++] = 0xc1; // sar $0x7,%edx // key >> 7;
    p[i++] = 0xfa;
    p[i++] = 0x07;
    p[i++] = 0x29; // sub %edx,%eax // key -= key >> 7;
    p[i++] = 0xd0;
    p[i++] = 0x89; // mov %eax,%edx
    p[i++] = 0xc2;
    p[i++] = 0xc1; // sar $0x5,%edx // key >> 5;
    p[i++] = 0xfa;
    p[i++] = 0x05;
    p[i++] = 0x29; // sub %edx,%eax // key -= key >> 5;
    p[i++] = 0xd0;
    p[i++] = 0x89; // mov %eax,%edx
    p[i++] = 0xc2;
    p[i++] = 0xc1; // sar $0x3,%edx // key >> 3;
    p[i++] = 0xfa;
    p[i++] = 0x03;
    p[i++] = 0x29; // sub %edx,%eax // key -= key >> 3;
    p[i++] = 0xd0;
    p[i++] = 0x25; // and $0xff,%eax // key &= 0xFF;
    p[i++] = 0xff;
    p[i++] = 0x00;
    p[i++] = 0x00;
    p[i++] = 0x00;
    p[i++] = 0x8b; // mov (flush_counter),%ecx // %ecx = counter = flush_counter;
    p[i++] = 0x0d;
    // 32-bit absolute address of flush_counter embedded as the operand.
    *((DWORD*) &p[i]) = (DWORD)&flush_counter;
    i += sizeof(DWORD);
    p[i++] = 0x8b; // mov thread_hints(,%eax,4),%edx // %edx = pThread = thread_hints[key];
    p[i++] = 0x14;
    p[i++] = 0x85;
    *((DWORD*) &p[i]) = (DWORD)&thread_hints;
    i += sizeof(DWORD);
    p[i++] = 0x39; // cmp %esp,offsetof(CPalThread,tlsInfo)+offsetof(CThreadTLSInfo,minStack)(%edx)
                   // if ((size_t)pThread->tlsInfo.minStack <= sp)
    p[i++] = 0xa2;
    *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,minStack));
    i += sizeof(DWORD);
    // Displacement 0x19 (25) = bytes from the end of this jump to the
    // CallInternalGetCurrentThreadSlow label below.
    p[i++] = 0x77; // ja CallInternalGetCurrentThreadSlow:
    p[i++] = 0x19;
    p[i++] = 0x3b; // cmp offsetof(CPalThread,tlsInfo)+offsetof(CThreadTLSInfo,maxStack)(%edx),%esp
                   // if (sp < (size_t)pThread->tlsInfo.maxStack)
    p[i++] = 0xa2;
    *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,maxStack));
    i += sizeof(DWORD);
    // Displacement 0x11 (17) = bytes from the end of this jump to the slow-path label.
    p[i++] = 0x73; // jae CallInternalGetCurrentThreadSlow:
    p[i++] = 0x11;
    p[i++] = 0x39; // cmp (flush_counter),%ecx // if (counter == flush_counter)
    p[i++] = 0x0d;
    *((DWORD*) &p[i]) = (DWORD)&flush_counter;
    i += sizeof(DWORD);
    // Displacement 0x09 = the 6-byte result load below plus pop/pop/ret.
    p[i++] = 0x75; // jne CallInternalGetCurrentThreadSlow:
    p[i++] = 0x09;
    // Both branches emit exactly 6 bytes so the hard-coded jump displacements
    // above stay valid regardless of dwTlsIndex.
    if (dwTlsIndex != THREAD_OBJECT_TLS_INDEX)
    {
        p[i++] = 0x8b; // mov offsetof(pThread->tlsSlots[dwTlsIndex])(%edx),%eax // %eax = pThread->tlsSlots[dwTlsIndex];
        p[i++] = 0x82;
        *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,tlsSlots[dwTlsIndex]));
        i += sizeof(DWORD);
    }
    else
    {
        p[i++] = 0x89; // mov %edx,%eax // %eax = pThread;
        p[i++] = 0xd0;
        // Pad the 2-byte mov up to the 6 bytes used by the other branch.
        p[i++] = 0x90; // nop
        p[i++] = 0x90; // nop
        p[i++] = 0x90; // nop
        p[i++] = 0x90; // nop
    }
    p[i++] = 0x5a; // pop %edx
    p[i++] = 0x59; // pop %ecx
    p[i++] = 0xc3; // ret
    // ---- Slow path ----
    // Restore the saved registers, build a frame, and call
    // InternalGetCurrentThreadSlow(reference esp, &thread_hints[key]).
    // CallInternalGetCurrentThreadSlow:
    p[i++] = 0x5a; // pop %edx
    p[i++] = 0x59; // pop %ecx
    p[i++] = 0x8d; // lea (thread_hints,%eax,4),%eax // %eax = &thread_hints[key];
    p[i++] = 0x04;
    p[i++] = 0x85;
    *((DWORD*) &p[i]) = (DWORD)&thread_hints;
    i += sizeof(DWORD);
    p[i++] = 0x55; // push %ebp
    p[i++] = 0x89; // mov %esp,%ebp
    p[i++] = 0xe5;
    p[i++] = 0x51; // push %ecx
    p[i++] = 0x89; // mov %esp,%ecx // this is the reference esp - need to match the reference esp used in the fast path.
    p[i++] = 0xe1;
    p[i++] = 0x52; // push %edx
#ifdef __APPLE__
    // establish 16-byte stack alignment
    p[i++] = 0x83; // subl $8,%esp
    p[i++] = 0xec;
    p[i++] = 0x08;
#endif
    p[i++] = 0x50; // push %eax // store &thread_hints[key] on stack as 2nd argument;
    p[i++] = 0x51; // push %ecx // reference esp - The 1st argument for call to InternalGetCurrentThreadSlow.
    p[i++] = 0xe8; // call InternalGetCurrentThreadSlow
    // The call operand is relative to the address of the instruction that
    // follows the 4-byte displacement.
    *((DWORD*) &p[i]) = (DWORD)&InternalGetCurrentThreadSlow - (DWORD)(&p[i+sizeof(DWORD)]);
    i += sizeof(DWORD);
    // Drop the two pushed arguments (plus the 8 alignment bytes on Apple).
#ifdef __APPLE__
    p[i++] = 0x83; // addl $16,%esp
    p[i++] = 0xc4;
    p[i++] = 0x10;
#else
    p[i++] = 0x83; // addl $8,%esp
    p[i++] = 0xc4;
    p[i++] = 0x08;
#endif
    // For THREAD_OBJECT_TLS_INDEX nothing is emitted here: %eax already holds
    // the value returned by InternalGetCurrentThreadSlow.
    if (dwTlsIndex != THREAD_OBJECT_TLS_INDEX)
    {
        p[i++] = 0x8b; // mov offsetof(pThread->tlsSlots[dwTlsIndex])(%eax),%eax  // %eax = pThread->tlsSlots[dwTlsIndex];
        p[i++] = 0x80;
        *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,tlsSlots[dwTlsIndex]));
        i += sizeof(DWORD);
    }
    p[i++] = 0x5a; // pop %edx
    p[i++] = 0x59; // pop %ecx
    p[i++] = 0xc9; // leave
    p[i++] = 0xc3; // ret

    // Sanity check that the emitted byte count fits the allocation.
    // NOTE(review): this runs after every byte has been written, so it can
    // only report an overrun of the buffer, not prevent one.
    if (i > TLS_OPTIMIZED_GETTER_SIZE)
    {
        ASSERT("Invalid TLS_OPTIMIZED_GETTER_SIZE %d\n", i);
    }

    // Flushes the whole allocation, which may include a few unwritten trailing
    // bytes when the shorter THREAD_OBJECT_TLS_INDEX variant was emitted.
    DBG_FlushInstructionCache(p, TLS_OPTIMIZED_GETTER_SIZE * sizeof(BYTE));

    Ret = (PAL_POPTIMIZEDTLSGETTER)p;

    return Ret;
#endif // BIT64 else
}

/*++
Function:
    CorUnix::TLSFreeOptimizedGetter

    Releases the code buffer of a getter previously produced by
    CorUnix::TLSMakeOptimizedGetter.

Parameters:
    pOptimizedTlsGetter - getter to release; its storage is handed to
                          InternalFree on behalf of the calling thread.

Return value:
    None.
--*/
VOID
CorUnix::TLSFreeOptimizedGetter(
        IN PAL_POPTIMIZEDTLSGETTER pOptimizedTlsGetter)
{
    // The getter pointer is the start of the allocated code buffer, so it can
    // be passed back to the allocator directly.
    void* pGetterCode = (void *)pOptimizedTlsGetter;
    CPalThread* pCurrentThread = InternalGetCurrentThread();

    InternalFree(pCurrentThread, pGetterCode);
}

#endif // USE_OPTIMIZEDTLSGETTER
